Add CLI normalizer #503

gaurav · 2025-08-28T21:39:04Z

We have several requests and needs for files containing CURIEs to be normalized in bulk. There are multiple ways of doing this using NodeNorm, but it would be nice to have a tool that can do it as an INNER JOIN against the combined DuckDB database we create while building NodeNorm, as that should be much faster than the other approaches. We could also use this to export every mapping we have from a particular source (see https://github.com/TranslatorSRI/NodeNormalization/issues/321).

Work in progress (WIP). This PR should be merged after PR #495.

diff --git c/src/babel_utils.py i/src/babel_utils.py index a96120d..5cbab9c 100644 --- c/src/babel_utils.py +++ i/src/babel_utils.py @@ -5,13 +5,15 @@ from enum import Enum from ftplib import FTP from io import BytesIO import gzip -from datetime import datetime as dt +from datetime import datetime as dt, datetime from datetime import timedelta import time import requests import os import urllib import jsonlines +import yaml + from src.node import NodeFactory, SynonymFactory, DescriptionFactory, InformationContentFactory, TaxonFactory from src.util import Text, get_config from src.LabeledID import LabeledID @@ -349,10 +351,11 @@ def get_numerical_curie_suffix(curie): return None -def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],icrdf_filename=None): +def write_compendium(metadata_yamls, synonym_list,ofname,node_type,labels={},extra_prefixes=[],icrdf_filename=None): """ + :param metadata_yaml: The YAML files containing the metadata for this compendium. :param synonym_list: - :param ofname: + :param ofname: Output filename. A file with this filename will be created in both the `compendia` and `synonyms` output directories. :param node_type: :param labels: A map of identifiers Not needed if each identifier will have a label in the correct directory (i.e. downloads/PMID/labels for PMID:xxx). @@ -371,6 +374,32 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i node_factory = NodeFactory(make_local_name(''),biolink_version) synonym_factory = SynonymFactory(make_local_name('')) + # Write out the metadata.yaml file combining information from all the metadata.yaml files. 
+ metadata_dir = os.path.join(cdir,'metadata') + os.makedirs(metadata_dir, exist_ok=True) + with open(os.path.join(cdir, ofname + '.yaml'), 'w') as outf: + metadata = { + 'type': 'compendium', + 'name': ofname, + 'created_at': datetime.now().isoformat(), + 'concords': {} + } + for metadata_yaml in metadata_yamls: + metadata_block = yaml.safe_load(metadata_yaml) + if metadata_block is None: + raise ValueError("Metadata file {metadata_yaml} is empty.") + + metadata_name = metadata_block['name'] + + if metadata_name in metadata['concords']: + logging.error(f"Duplicate metadata block name {metadata_name}!") + logging.error(f"New metadata block from {metadata_yaml}: {metadata_block}!") + logging.error(f"Existing metadata block: {metadata['concords'][metadata_name]}!") + raise ValueError(f"Metadata file {metadata_yaml} is named {metadata_name}, but this has already been loaded.") + metadata['concords'][metadata_name] = metadata_block + + outf.write(yaml.dump(metadata)) + # Load the preferred_name_boost_prefixes -- this tells us which prefixes to boost when # coming up with a preferred label for a particular Biolink class. preferred_name_boost_prefixes = config['preferred_name_boost_prefixes']

diff --git c/src/babel_utils.py i/src/babel_utils.py index f973337..59a5360 100644 --- c/src/babel_utils.py +++ i/src/babel_utils.py @@ -14,6 +14,7 @@ import urllib import jsonlines import yaml +from src.metadata.provenance import write_combined_metadata from src.node import NodeFactory, SynonymFactory, DescriptionFactory, InformationContentFactory, TaxonFactory from src.util import Text, get_config from src.LabeledID import LabeledID @@ -559,44 +560,17 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels={}, exit() # Write out the metadata.yaml file combining information from all the metadata.yaml files. - metadata_dir = os.path.join(cdir,'metadata') - os.makedirs(metadata_dir, exist_ok=True) - with open(os.path.join(cdir, 'metadata', ofname + '.yaml'), 'w') as outf: - # TODO: move into metadata/provenance.py - metadata = { - 'type': 'compendium', - 'name': ofname, - 'created_at': datetime.now().isoformat(), - 'counts': { - 'cliques': count_cliques, - 'eq_ids': count_eq_ids, - 'synonyms': count_synonyms, - }, - 'concords': {} - } - for metadata_yaml in metadata_yamls: - with open(metadata_yaml, 'r') as metaf: - metadata_block = yaml.safe_load(metaf) - if metadata_block is None or metadata_block == {}: - raise ValueError("Metadata file {metadata_yaml} is empty.") - - if 'name' not in metadata_block: - raise ValueError(f"Metadata file {metadata_yaml} is missing a 'name' field: {metadata_block}") - - metadata_name = metadata_block['name'] - - if type(metadata_name) != str: - raise ValueError(f"Metadata file {metadata_yaml} has a 'name' field that is not a string: {metadata_block}") - - if metadata_name in metadata['concords']: - # If it's not already a list, then make it into a list. 
- if type(metadata['concords'][metadata_name]) != list: - metadata['concords'][metadata_name] = [metadata['concords'][metadata_name]] - metadata['concords'][metadata_name].append(metadata_block) - else: - metadata['concords'][metadata_name] = metadata_block - - yaml.dump(metadata, outf) + write_combined_metadata( + os.path.join(cdir, 'metadata', ofname + '.yaml'), + typ='compendium', + name=ofname, + counts={ + 'cliques': count_cliques, + 'eq_ids': count_eq_ids, + 'synonyms': count_synonyms, + }, + combined_from_filenames=metadata_yamls, + ) def glom(conc_set, newgroups, unique_prefixes=['INCHIKEY'],pref='HP',close={}): """We want to construct sets containing equivalent identifiers. diff --git c/src/createcompendia/drugchemical.py i/src/createcompendia/drugchemical.py index 2de4804..8dee460 100644 --- c/src/createcompendia/drugchemical.py +++ i/src/createcompendia/drugchemical.py @@ -1,5 +1,6 @@ import csv +from src.metadata.provenance import write_combined_metadata, write_concord_metadata from src.node import NodeFactory, InformationContentFactory from src.prefixes import RXCUI, PUBCHEMCOMPOUND, UMLS from src.categories import (CHEMICAL_ENTITY, DRUG, MOLECULAR_MIXTURE, FOOD, COMPLEX_MOLECULAR_MIXTURE, @@ -139,7 +140,7 @@ def get_cui(x,indicator_column,cui_column,aui_column,aui_to_cui,sdui_to_cui): print(x) exit() -def build_rxnorm_relationships(conso, relfile, outfile): +def build_rxnorm_relationships(conso, relfile, outfile, metadata_yaml): """RXNREL is a lousy file. The subject and object can sometimes be a CUI and sometimes an AUI and you have to use CONSO to figure out how to go back and forth. @@ -167,8 +168,32 @@ def build_rxnorm_relationships(conso, relfile, outfile): #This is maybe relying on convention a bit too much. 
if outfile == "UMLS": prefix = UMLS + sources = [ + { + 'type': 'UMLS', + 'name': 'MRCONSO', + 'filename': conso + }, + { + 'type': 'UMLS', + 'name': 'MRREL', + 'filename': relfile + } + ] else: prefix = RXCUI + sources = [ + { + 'type': 'RXNORM', + 'name': 'RXNCONSO', + 'filename': conso + }, + { + 'type': 'RXNOM', + 'name': 'RXNREL', + 'filename': relfile + } + ] aui_to_cui, sdui_to_cui = get_aui_to_cui(conso) # relfile = os.path.join('input_data', 'private', "RXNREL.RRF") single_use_relations = {"has_active_ingredient": defaultdict(set), @@ -214,6 +239,13 @@ def build_rxnorm_relationships(conso, relfile, outfile): continue outf.write(f"{prefix}:{subject}\t{predicate}\t{prefix}:{next(iter(objects))}\n") + write_concord_metadata( + metadata_yaml, + name='build_rxnorm_relationships()', + description=f'Builds relationships between RxCUI and other identifiers from a CONSO ({conso}) and a REL ({relfile}).', + sources=sources + ) + def load_cliques(compendium): rx_to_clique = {} @@ -228,7 +260,7 @@ def load_cliques(compendium): rx_to_clique[terms["i"]] = clique return rx_to_clique -def build_pubchem_relationships(infile,outfile): +def build_pubchem_relationships(infile,outfile, metadata_yaml): with open(infile,"r") as inf: document = json.load(inf) with open(outfile,"w") as outf: @@ -238,7 +270,19 @@ def build_pubchem_relationships(infile,outfile): for cid in cids: outf.write(f"{RXCUI}:{rxnid}\tlinked\t{PUBCHEMCOMPOUND}:{cid}\n") -def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem_rxn_concord, drug_compendium, chemical_compendia, icrdf_filename, outfilename): + write_concord_metadata( + metadata_yaml, + name='build_pubchem_relationships()', + description=f'Builds relationships between RxCUI and PubChem Compound identifiers from a PubChem annotations file ({infile}.', + sources=[{ + 'type': 'PubChem', + 'name': 'PubChem RxNorm annotations', + 'description': 'PubChem RxNorm mappings generated by pubchem.pull_rxnorm_annotations()', + 
'filename': infile + }] + ) + +def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem_rxn_concord, drug_compendium, chemical_compendia, icrdf_filename, outfilename, input_metadata_yamls, output_metadata_yaml): """RXN_concord contains relationshps between rxcuis that can be used to conflate Now we don't want all of them. We want the ones that are between drugs and chemicals, and the ones between drugs and drugs. @@ -556,6 +600,15 @@ def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem outfile.write(f"{json.dumps(final_conflation_id_list)}\n") written.add(fs) + # Write out metadata.yaml + write_combined_metadata( + output_metadata_yaml, + typ='conflation', + name='drugchemical.build_conflation()', + description='Build DrugChemical conflation.', + combined_from_filenames=input_metadata_yamls + ) + def sort_by_curie_suffix(curie): """ diff --git c/src/metadata/provenance.py i/src/metadata/provenance.py index 54bc50e..5a8f703 100644 --- c/src/metadata/provenance.py +++ i/src/metadata/provenance.py @@ -1,3 +1,4 @@ +import os.path from datetime import datetime import yaml @@ -8,13 +9,56 @@ def write_download_metadata(filename, name, url='', description='', sources=None def write_concord_metadata(filename, name, url='', description='', sources=None, counts=None): write_metadata(filename, 'concord', name, url=url, description=description, sources=sources, counts=None) -def write_metadata(filename, typ, name, sources=None, url='', description='', counts=None): - if type(name) != str: +def write_combined_metadata(filename, typ, name, sources=None, url='', description='', counts=None, combined_from_filenames=None): + combined_from = {} + if combined_from_filenames is not None: + for metadata_yaml in combined_from_filenames: + with open(metadata_yaml, 'r') as metaf: + metadata_block = yaml.safe_load(metaf) + if metadata_block is None or metadata_block == {}: + raise ValueError("Metadata file {metadata_yaml} is empty.") + + 
if 'name' not in metadata_block: + raise ValueError(f"Metadata file {metadata_yaml} is missing a 'name' field: {metadata_block}") + + metadata_name = metadata_block['name'] + + if type(metadata_name) is not str: + raise ValueError(f"Metadata file {metadata_yaml} has a 'name' field that is not a string: {metadata_block}") + + if metadata_name in combined_from: + # If it's not already a list, then make it into a list. + if type(combined_from[metadata_name]) is not list: + combined_from[metadata_name] = [combined_from[metadata_name]] + combined_from[metadata_name].append(metadata_block) + else: + combined_from[metadata_name] = metadata_block + + write_metadata( + filename, + typ=typ, + name=name, + sources=sources, + url=url, + description=description, + counts=counts, + combined_from=combined_from + ) + +def write_metadata(filename, typ, name, sources=None, url='', description='', counts=None, combined_from=None): + if type(typ) is not str: + raise ValueError(f"Metadata entry type must be a string, not {type(typ)}: '{typ}'") + if type(name) is not str: raise ValueError(f"Metadata entry name must be a string, not {type(name)}: '{name}'") if sources is None: sources = [] if counts is None: counts = [] + if combined_from is None: + combined_from = [] + + metadata_dir = os.path.dirname(filename) + os.makedirs(metadata_dir, exist_ok=True) with open(filename, 'w') as fout: yaml.dump({ 'created_at': datetime.now().isoformat(), @@ -24,4 +68,5 @@ def write_metadata(filename, typ, name, sources=None, url='', description='', co 'description': description, 'sources': sources, 'counts': counts, + 'combined_from': combined_from, }, fout) diff --git c/src/snakefiles/drugchemical.snakefile i/src/snakefiles/drugchemical.snakefile index 9640c13..3f6a8d3 100644 --- c/src/snakefiles/drugchemical.snakefile +++ i/src/snakefiles/drugchemical.snakefile @@ -1,6 +1,7 @@ import src.createcompendia.drugchemical as drugchemical import src.synonyms.synonymconflation as synonymconflation import 
src.snakefiles.util as util +from src.metadata.provenance import write_concord_metadata ### Drug / Chemical @@ -9,39 +10,56 @@ rule rxnorm_relationships: rxnconso = config['download_directory'] + "/RxNorm/RXNCONSO.RRF", rxnrel = config['download_directory'] + "/RxNorm/RXNREL.RRF", output: - outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/RXNORM' + outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/RXNORM', + metadata_yaml = config['intermediate_directory'] + '/drugchemical/concords/metadata-RXNORM.yaml' run: - drugchemical.build_rxnorm_relationships(input.rxnconso, input.rxnrel, output.outfile_concords) + drugchemical.build_rxnorm_relationships(input.rxnconso, input.rxnrel, output.outfile_concords, output.metadata_yaml) rule umls_relationships: input: umlsconso = config['download_directory'] + "/UMLS/MRCONSO.RRF", umlsrel = config['download_directory'] + "/UMLS/MRREL.RRF", output: - outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/UMLS' + outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/UMLS', + metadata_yaml = config['intermediate_directory'] + '/drugchemical/concords/metadata-UMLS.yaml' run: - drugchemical.build_rxnorm_relationships(input.umlsconso, input.umlsrel, output.outfile_concords) + drugchemical.build_rxnorm_relationships(input.umlsconso, input.umlsrel, output.outfile_concords, output.metadata_yaml) rule pubchem_rxnorm_relationships: input: infile = config['download_directory'] + '/PUBCHEM.COMPOUND/RXNORM.json', output: - outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/PUBCHEM_RXNORM' + outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/PUBCHEM_RXNORM', + metadata_yaml = config['intermediate_directory'] + '/drugchemical/concords/metadata-PUBCHEM_RXNORM.yaml' run: - drugchemical.build_pubchem_relationships(input.infile,output.outfile_concords) + 
drugchemical.build_pubchem_relationships(input.infile,output.outfile_concords, output.metadata_yaml) rule drugchemical_conflation: input: drug_compendium=config['output_directory']+'/compendia/'+'Drug.txt', chemical_compendia=expand("{do}/compendia/{co}", do=config['output_directory'], co=config['chemical_outputs']), rxnorm_concord=config['intermediate_directory']+'/drugchemical/concords/RXNORM', + rxnorm_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-RXNORM.yaml', umls_concord=config['intermediate_directory']+'/drugchemical/concords/UMLS', + umls_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-UMLS.yaml', pubchem_concord=config['intermediate_directory']+'/drugchemical/concords/PUBCHEM_RXNORM', + pubchem_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-PUBCHEM_RXNORM.yaml', drugchemical_manual_concord=config['input_directory']+'/manual_concords/drugchemical.tsv', icrdf_filename=config['download_directory']+'/icRDF.tsv', output: - outfile=config['output_directory']+'/conflation/DrugChemical.txt' + outfile=config['output_directory']+'/conflation/DrugChemical.txt', + metadata_yaml=config['output_directory']+'/conflation/metadata.yaml', + drugchemical_manual_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-Manual.yaml', run: + write_concord_metadata(input.drugchemical_manual_metadata, + name='Manual DrugChemical Concords', + description='Manually curated DrugChemical conflation cross-references from the Babel repository', + sources=[{ + 'name': 'Babel repository', + 'url': 'https://github.com/TranslatorSRI/Babel', + }], + url='https://github.com/TranslatorSRI/Babel/blob/master/input_data/manual_concords/drugchemical.tsv', + ) drugchemical.build_conflation( input.drugchemical_manual_concord, input.rxnorm_concord, @@ -50,7 +68,13 @@ rule drugchemical_conflation: input.drug_compendium, input.chemical_compendia, input.icrdf_filename, - output.outfile) + 
output.outfile, + input_metadata_yamls={ + 'RXNORM': input.rxnorm_metadata, + 'UMLS': input.umls_metadata, + 'PUBCHEM_RXNORM': input.pubchem_metadata, + 'Manual': input.drugchemical_manual_metadata, + }, output_metadata_yaml=output.metadata_yaml) rule drugchemical_conflated_synonyms: input:

Since MeSH is not an ids file for proteins, this should only pull in MeSH IDs that are associated with a UMLS ID.

Could also be useful to track memory in the future.

gaurav and others added 30 commits August 2, 2025 21:34

Updated UMLS and RxNorm.

3bacd04

Added counts to metadata.

e800dfd

Added metadata files to build_* methods.

4db83fe

Added metadata YAMLs to Snakemake dependencies.

5d207f2

First stab at metadata files.

8abbb8b

Fixed typo.

c8ed59b

Fixed build_anatomy_umls_relationships() params.

a5af518

Added metadata for CellLine.

519641b

Fixed some metadata output.

0c5bc87

Fixed some issues.

650ebb4

Fixed some cell_line bugs.

2fa76f3

First stab at adding metadata to Chemical.

baf970b

Turned off DRY_RUN for testing.

48c9064

Added metadata outputs for cell_line and anatomy.

e97ac42

Fixed sources.

b444e1c

First stab at Chemical concord metadata.

fe9ed81

Added check for type of name (in case people try passing in an object).

9adc3ff

Fixed metadata.yaml output.

963197c

Updated behavior of concord combinations.

b49384b

Fixed write_concord_metadata() calls.

640fbb7

Added metadata to module targets.

edd78ab

Added target to chemical.

54802a7

Centralized UMLS build_sets() metadata generation.

4e9e1e7

Added provenance metadata to diseasephenotype module.

2fc7053

Added concords to the Gene module.

da2dc1d

Added metadata for genefamilies.

2009778

Added metadata to a method we probably don't use any more.

a44eda1

Added MacromolecularComplex metadata requirement.

188cdd0

gaurav added 30 commits August 17, 2025 18:06

Moved DrugChemical metadata into metadata/

e8ed050

Cleaned up code.

ec78408

Moved one configuration option into its own section.

6a71101

Ack no that's not a temp file.

45e8e8e

Ack no that's not a temp file.

675671e

Added GeneProteinConflated.txt.gz as an output.

eca8284

Tweaked code so it lines up with DrugChemicalConflated.

6dbf5be

Added to various reports.

0546767

Merge branch 'add-geneprotein-conflated-synonyms' into babel-v1.12.0

4cca517

Fixed bug in handling taxa.

531692d

Merge branch 'add-geneprotein-conflated-synonyms' into babel-v1.12.0

96eaa59

Fixed some bugs in actually exporting the ChEBI alternate properties.

675939f

Attempt to fix a bug in KEGG xrefs from ChEBI.

7a53ce6

Added a UMLS-MeSH concord to proteins.

4ffb0ad

Since MeSH is not an ids file for proteins, this should only pull in MeSH IDs that are associated with a UMLS ID.

Added DRUGBANK mappings to UMLS/protein concords.

9495a6d

DuckDB files are temporary, but regenerating them is a pain.

0fe0690

Added comments to document what chemicals.build_compendia() is doing.

14219c3

Could also be useful to track memory in the future.

Merge branch 'babel-v1.12.0' into add-umls-mesh-mappings-for-proteins

d170020

Fixed PropertyList count.

656032e

Fixed PropertyList count.

97fd066

Improved log message.

3afc6f9

Fixed bug in PropertyList count.

30eb4e8

Merge branch 'babel-v1.12.0' into add-umls-mesh-mappings-for-proteins

a304e71

Added DRUGBANK as an extra prefix for the Protein compendia.

472ff59

Fixed minor bug in properties.

55fe9fe

Fixed bug in getting the alternative ID.

9d8a0ce

Merge branch 'babel-v1.12.0' into add-umls-mesh-mappings-for-proteins

2dafc67

Major cleanup of additional CURIEs with labels and properties.

b5731ed

Merge branch 'babel-v1.12.0' into add-umls-mesh-mappings-for-proteins

4a427be

Added DuckDB to the Dockerfile so we can use the CLI.

ece1da8

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Add CLI normalizer #503

Add CLI normalizer #503

Uh oh!

gaurav commented Aug 28, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

2 participants

Add CLI normalizer #503

Are you sure you want to change the base?

Add CLI normalizer #503

Uh oh!

Conversation

gaurav commented Aug 28, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

2 participants